pacman::p_load(tidyverse, jsonlite, SmartEDA, tidygraph, ggraph)In-class Exercise 05
Loading Packages
Importing Knowledge Graph Data
# Read the knowledge-graph JSON file into an R list.
kg <- fromJSON(txt = "MC1_graph.json")
Inspect Structure
# Print only the top level of the parsed object.
str(kg, max.level = 1)
List of 5
$ directed : logi TRUE
$ multigraph: logi TRUE
$ graph :List of 2
$ nodes :'data.frame': 17412 obs. of 10 variables:
$ links :'data.frame': 37857 obs. of 4 variables:
Extract and Inspect
# Convert the node and link data frames stored in `kg` to tibbles.
nodes_tb1 <- kg$nodes %>% as_tibble()
edges_tb1 <- kg$links %>% as_tibble()
Initial EDA
# Horizontal bar chart: number of edges for each `Edge Type`.
edges_tb1 %>%
  ggplot(mapping = aes(y = `Edge Type`)) +
  geom_bar()
# Horizontal bar chart: number of nodes for each `Node Type`.
nodes_tb1 %>%
  ggplot(mapping = aes(y = `Node Type`)) +
  geom_bar()
Creating Knowledge Graph
This is
Step 1: Mapping from node id to row index
# Lookup table pairing each node id with its row position in `nodes_tb1`.
id_map <- nodes_tb1 %>%
  select(id) %>%
  mutate(index = row_number())
This ensures each id from your node list is mapped to the correct row number.
Step 2: Map source and target IDs to row indices
# Attach node row indices to both endpoints of every edge:
# `from` is looked up via `source`, `to` via `target`.
# An id that is missing from `id_map` yields NA in the new column.
edges_tb1 <- edges_tb1 %>%
  left_join(
    id_map %>% select(source = id, from = index),
    by = "source"
  ) %>%
  left_join(
    id_map %>% select(target = id, to = index),
    by = "target"
  )
Step 3: Filter out any unmatched (invalid) edges
# Drop edges whose `from` or `to` index is NA, i.e. whose source or target id
# was not found in the node lookup table.
edges_tb1 <- edges_tb1 %>%
  filter(!is.na(from), !is.na(to))
Step 4: creating the graph
Lastly, tbl_graph() is used to create tidygraph’s graph object by using the code chunk below.
# Build the directed tidygraph object from the node tibble and the edge tibble
# (whose `from`/`to` columns hold row indices into `nodes_tb1`).
graph <- tbl_graph(
  nodes = nodes_tb1,
  edges = edges_tb1,
  directed = TRUE
)
Visualising the knowledge graph
# Fix the random seed before drawing (presumably so the layout is reproducible).
set.seed(1234)
Visualising the whole Graph
# Draw every node and edge with the "fr" layout: faint gray edges, points
# coloured by `Node Type`, and repelled text labels taken from `name`.
ggraph(graph, layout = "fr") +
  geom_edge_link(colour = "gray", alpha = 0.3) +
  geom_node_point(aes(color = `Node Type`), size = 4) +
  geom_node_text(aes(label = name), size = 2.5, repel = TRUE) +
  theme_void()
Step 1: Filter edges to only “MemberOf”
# Keep only the edges whose `Edge Type` is "MemberOf".
# `%E>%` activates the edge table before piping into filter().
graph_memberof <- graph %E>%
  filter(`Edge Type` == "MemberOf")
Step 2: Extract only connected nodes (i.e. used in these edges)
# Collect the distinct node row indices that occur as an endpoint
# (`from` or `to`) of any edge left in `graph_memberof`.
used_node_indices <- graph_memberof %>%
  activate(edges) %>%
  as_tibble() %>%
  with(unique(c(from, to)))
Step 3: Keep only those nodes
# Retain only the nodes whose row position is listed in `used_node_indices`.
graph_memberof <- graph_memberof %>%
  activate(nodes) %>%
  filter(row_number() %in% used_node_indices)
Plot the sub-graph
# Draw the MemberOf sub-graph with the "fr" layout: gray edges, small points
# coloured by `Node Type`, and repelled text labels taken from `name`.
ggraph(graph_memberof, layout = "fr") +
  geom_edge_link(colour = "gray", alpha = 0.5) +
  geom_node_point(aes(color = `Node Type`), size = 1) +
  geom_node_text(aes(label = name), size = 2.5, repel = TRUE) +
  theme_void()
Warning: ggrepel: 789 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
